Exploratory Data Analysis¶

In [1]:
# import library

import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from wordcloud import WordCloud
from plotly import graph_objs as go
In [2]:
# import dataset

df = pd.read_csv (r"D:\Pasca Wisuda\Data analyst\My project portfolio\Dataset\chocolate_bars.csv")
df
Out[2]:
id manufacturer company_location year_reviewed bean_origin bar_name cocoa_percent num_ingredients ingredients review rating
0 2454 5150 U.S.A. 2019 Tanzania Kokoa Kamili, batch 1 76.0 3.0 B,S,C rich cocoa, fatty, bready 3.25
1 2458 5150 U.S.A. 2019 Dominican Republic Zorzal, batch 1 76.0 3.0 B,S,C cocoa, vegetal, savory 3.50
2 2454 5150 U.S.A. 2019 Madagascar Bejofo Estate, batch 1 76.0 3.0 B,S,C cocoa, blackberry, full body 3.75
3 2542 5150 U.S.A. 2021 Fiji Matasawalevu, batch 1 68.0 3.0 B,S,C chewy, off, rubbery 3.00
4 2546 5150 U.S.A. 2021 Venezuela Sur del Lago, batch 1 72.0 3.0 B,S,C fatty, earthy, moss, nutty,chalky 3.00
... ... ... ... ... ... ... ... ... ... ... ...
2525 1205 Zotter Austria 2014 Blend Raw 80.0 4.0 B,S*,C,Sa waxy, cloying, vegetal 2.75
2526 1996 Zotter Austria 2017 Colombia APROCAFA, Acandi 75.0 3.0 B,S,C strong nutty, marshmallow 3.75
2527 2036 Zotter Austria 2018 Blend Dry Aged, 30 yr Anniversary bar 75.0 3.0 B,S,C fatty, earthy, cocoa 3.00
2528 2170 Zotter Austria 2018 Congo Mountains of the Moon 70.0 3.0 B,S,C fatty, mild nuts, mild fruit 3.25
2529 2170 Zotter Austria 2018 Belize Maya Mtn 72.0 3.0 B,S,C muted, roasty, accessible 3.50

2530 rows × 11 columns

In [3]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2530 entries, 0 to 2529
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                2530 non-null   int64  
 1   manufacturer      2530 non-null   object 
 2   company_location  2530 non-null   object 
 3   year_reviewed     2530 non-null   int64  
 4   bean_origin       2530 non-null   object 
 5   bar_name          2530 non-null   object 
 6   cocoa_percent     2530 non-null   float64
 7   num_ingredients   2443 non-null   float64
 8   ingredients       2443 non-null   object 
 9   review            2530 non-null   object 
 10  rating            2530 non-null   float64
dtypes: float64(3), int64(2), object(6)
memory usage: 217.6+ KB
In [4]:
# change data type of num_ingredients to not a decimal number (Int64) which allow missing values

df['num_ingredients']=df['num_ingredients'].astype('Int64')
In [5]:
# change data type of cocoa_percent to not decimal number

df['cocoa_percent']=df['cocoa_percent'].astype('int')
In [6]:
# df info after changing the data type

df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2530 entries, 0 to 2529
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                2530 non-null   int64  
 1   manufacturer      2530 non-null   object 
 2   company_location  2530 non-null   object 
 3   year_reviewed     2530 non-null   int64  
 4   bean_origin       2530 non-null   object 
 5   bar_name          2530 non-null   object 
 6   cocoa_percent     2530 non-null   int32  
 7   num_ingredients   2443 non-null   Int64  
 8   ingredients       2443 non-null   object 
 9   review            2530 non-null   object 
 10  rating            2530 non-null   float64
dtypes: Int64(1), float64(1), int32(1), int64(2), object(6)
memory usage: 210.1+ KB
In [7]:
# description for numeric data

df.describe()
Out[7]:
id year_reviewed cocoa_percent num_ingredients rating
count 2530.000000 2530.000000 2530.000000 2443.0 2530.000000
mean 1429.800791 2014.374308 71.637945 3.041343 3.196344
std 757.648556 3.968267 5.616953 0.913728 0.445321
min 5.000000 2006.000000 42.000000 1.0 1.000000
25% 802.000000 2012.000000 70.000000 2.0 3.000000
50% 1454.000000 2015.000000 70.000000 3.0 3.250000
75% 2079.000000 2018.000000 74.000000 4.0 3.500000
max 2712.000000 2021.000000 100.000000 6.0 4.000000
In [8]:
# description for object data

df.describe(include='object')
Out[8]:
manufacturer company_location bean_origin bar_name ingredients review
count 2530 2530 2530 2530 2443 2530
unique 580 67 62 1605 21 2487
top Soma U.S.A. Venezuela Madagascar B,S,C spicy, cocoa
freq 56 1136 253 55 999 4
In [9]:
# count how many duplicate data

df.duplicated().sum()
Out[9]:
0
In [10]:
# count how many unique values

df.nunique()
Out[10]:
id                   630
manufacturer         580
company_location      67
year_reviewed         16
bean_origin           62
bar_name            1605
cocoa_percent         42
num_ingredients        6
ingredients           21
review              2487
rating                12
dtype: int64
In [11]:
# count how many missing values

df.isnull().sum()
Out[11]:
id                   0
manufacturer         0
company_location     0
year_reviewed        0
bean_origin          0
bar_name             0
cocoa_percent        0
num_ingredients     87
ingredients         87
review               0
rating               0
dtype: int64
In [12]:
# look unique values for column with missing values

df['num_ingredients'].unique()
Out[12]:
<IntegerArray>
[3, 4, 2, 5, 6, <NA>, 1]
Length: 7, dtype: Int64
In [13]:
# look uniques values for column with missing  values

df['ingredients'].unique()
Out[13]:
array(['B,S,C', 'B,S,C,L', 'B,S', 'B,S,C,V', 'B,S,C,V,L', 'B,S,C,V,L,Sa',
       'B,S,C,V,Sa', nan, 'B,S,V,L', 'B,S*', 'B', 'B,S*,C', 'B,S,L',
       'B,S,V', 'B,S*,C,L', 'B,S*,C,Sa', 'B,S*,Sa', 'B,S,C,Sa',
       'B,S*,V,L', 'B,C', 'B,S*,C,V', 'B,S,C,L,Sa'], dtype=object)
In [14]:
df.columns
Out[14]:
Index(['id', 'manufacturer', 'company_location', 'year_reviewed',
       'bean_origin', 'bar_name', 'cocoa_percent', 'num_ingredients',
       'ingredients', 'review', 'rating'],
      dtype='object')
In [15]:
# histogram for numeric data

df.hist(grid=False, layout=(5,4), figsize=(20,15), bins=15)
plt.show()
In [16]:
# top 10 manufacturers with highest number

df['manufacturer'].value_counts().head(10)
Out[16]:
manufacturer
Soma         56
Fresco       39
Arete        32
Bonnat       30
A. Morin     26
Dandelion    25
Pralus       25
Domori       23
Valrhona     22
Guittard     22
Name: count, dtype: int64
In [17]:
# Bar plot visualization for top 10 chocolate bar manufacturer

mnf = df['manufacturer'].value_counts().head(10).plot(kind='bar', color='#45B39D', figsize = (6,4), 
                                                      title='Top 10 Chocolate Bar Manufacturer')

mnf.set_xlabel('Manufacturer')

mnf.set_ylabel('Count')

plt.show
Out[17]:
<function matplotlib.pyplot.show(close=None, block=None)>
In [18]:
# top 10 highest company locations

df['company_location'].value_counts().head(10)
Out[18]:
company_location
U.S.A.         1136
Canada          177
France          176
U.K.            133
Italy            78
Belgium          63
Ecuador          58
Australia        53
Switzerland      44
Germany          42
Name: count, dtype: int64
In [19]:
# top 10 bean origin

df['bean_origin'].value_counts().head(10)
Out[19]:
bean_origin
Venezuela             253
Peru                  244
Dominican Republic    226
Ecuador               219
Madagascar            177
Blend                 156
Nicaragua             100
Bolivia                80
Tanzania               79
Colombia               79
Name: count, dtype: int64
In [20]:
# count of rating for all the samples

df['rating'].value_counts()
Out[20]:
rating
3.50    565
3.00    523
3.25    464
2.75    333
3.75    300
2.50    166
4.00    112
2.00     33
2.25     17
1.50     10
1.00      4
1.75      3
Name: count, dtype: int64
In [21]:
# scatter plot for rating vs number of ingredients

sct = sns.scatterplot(x='rating', y='num_ingredients', data=df)

sct.set_title('Rating vs Number of Ingredients')

plt.show()
In [22]:
# make correlation for numeric data only

df_corr=df.dropna().corr(method='pearson', numeric_only=True)

df_corr
Out[22]:
id year_reviewed cocoa_percent num_ingredients rating
id 1.000000 0.993091 0.047619 -0.356845 0.085569
year_reviewed 0.993091 1.000000 0.048085 -0.362956 0.087774
cocoa_percent 0.047619 0.048085 1.000000 -0.187792 -0.076925
num_ingredients -0.356845 -0.362956 -0.187792 1.000000 -0.098834
rating 0.085569 0.087774 -0.076925 -0.098834 1.000000
In [23]:
# heatmap to show correlation for numeric data only

heatmap_corr = px.imshow(df_corr, color_continuous_scale = 'YlGnBu', text_auto='.2f', aspect= 'auto',
                    title='Correlation Between All numeric columns')

heatmap_corr.show()
In [24]:
# make a copy of dataframe

df2 = df.copy()
In [25]:
# numerized all the columns to get the correlation for each columns

df_numerized = df2

for col_name in df_numerized.columns:
    if (df_numerized[col_name].dtype=='object'):
        df_numerized[col_name] = df_numerized[col_name].astype('category')
        df_numerized[col_name] = df_numerized[col_name].cat.codes

df_numerized
Out[25]:
id manufacturer company_location year_reviewed bean_origin bar_name cocoa_percent num_ingredients ingredients review rating
0 2454 0 62 2019 52 750 76 3 10 1679 3.25
1 2458 0 62 2019 13 1597 76 3 10 319 3.50
2 2454 0 62 2019 28 162 76 3 10 288 3.75
3 2542 0 62 2021 16 935 68 3 10 229 3.00
4 2546 0 62 2021 60 1418 72 3 10 742 3.00
... ... ... ... ... ... ... ... ... ... ... ...
2525 1205 573 3 2014 2 1249 80 4 6 2452 2.75
2526 1996 573 3 2017 8 11 75 3 10 2220 3.75
2527 2036 573 3 2018 2 487 75 3 10 741 3.00
2528 2170 573 3 2018 9 1008 70 3 10 756 3.25
2529 2170 573 3 2018 1 953 72 3 10 1396 3.50

2530 rows × 11 columns

In [26]:
# correlation table for all the data

df_corr2=df_numerized.dropna().corr(method='pearson')

df_corr2
Out[26]:
id manufacturer company_location year_reviewed bean_origin bar_name cocoa_percent num_ingredients ingredients review rating
id 1.000000 0.020080 0.126790 0.993091 0.058748 0.037507 0.047619 -0.356845 -0.278015 -0.035936 0.085569
manufacturer 0.020080 1.000000 -0.089814 0.018340 -0.029630 0.001810 0.037788 0.050261 0.030292 0.021144 -0.025883
company_location 0.126790 -0.089814 1.000000 0.135263 0.080599 0.028114 0.029851 -0.216424 -0.199825 -0.016504 -0.073725
year_reviewed 0.993091 0.018340 0.135263 1.000000 0.056912 0.035722 0.048085 -0.362956 -0.281374 -0.034769 0.087774
bean_origin 0.058748 -0.029630 0.080599 0.056912 1.000000 0.190436 -0.009190 -0.031379 -0.006710 -0.027607 0.037417
bar_name 0.037507 0.001810 0.028114 0.035722 0.190436 1.000000 -0.026203 -0.015410 -0.018399 -0.014992 0.000376
cocoa_percent 0.047619 0.037788 0.029851 0.048085 -0.009190 -0.026203 1.000000 -0.187792 -0.142607 -0.097574 -0.076925
num_ingredients -0.356845 0.050261 -0.216424 -0.362956 -0.031379 -0.015410 -0.187792 1.000000 0.892510 0.032839 -0.098834
ingredients -0.278015 0.030292 -0.199825 -0.281374 -0.006710 -0.018399 -0.142607 0.892510 1.000000 0.025908 -0.067373
review -0.035936 0.021144 -0.016504 -0.034769 -0.027607 -0.014992 -0.097574 0.032839 0.025908 1.000000 -0.095686
rating 0.085569 -0.025883 -0.073725 0.087774 0.037417 0.000376 -0.076925 -0.098834 -0.067373 -0.095686 1.000000
In [27]:
# heatmap of correlation between all the data

heatmap_corr2 = px.imshow(df_corr2, color_continuous_scale = 'YlGnBu', text_auto='.2f', aspect= 'auto',
                    title='Correlation Between All the columns')

heatmap_corr2.show()

year vs id, ingredients vs num_ingredients are columns that have high positive correlation each other.

Ask questions about the data¶

Questions

  1. What bean origin and company of chocolate generate high rating?
  2. What are the characteristics of the highest (4.00) rating chocolate bar?
  3. What characteristics of chocolate bar with highest rating (4.00) based on ingredient?
  4. What is the detailed information about outstanding, highly recommended, recommended, disappointing, and unpleasant flavor chocolate bar?
  5. What is the detailed information about unpleasant flavor chocolate bar?
  6. What rating trends by year?
  7. What is the correlation between percentage of cocoa with rating?
  8. What is the correlation between bean origin vs rating?
  9. What insights can we gain from Indonesia bean origin?
  10. What most ingredient effect rating of chocolate bar?
In [28]:
# Question 1. What bean origin and company of chocolate generate high rating?

high_rating_by_origin = df.groupby(['bean_origin'])[['rating','company_location']].max().sort_values(by='rating', ascending=False).reset_index()

high_rating_by_origin
Out[28]:
bean_origin rating company_location
0 Mexico 4.00 U.S.A.
1 Papua New Guinea 4.00 U.S.A.
2 Haiti 4.00 U.S.A.
3 Indonesia 4.00 U.S.A.
4 Jamaica 4.00 U.S.A.
... ... ... ...
57 Sri Lanka 3.00 Sweden
58 Sierra Leone 3.00 U.S.A.
59 Principe 2.75 Hungary
60 St.Vincent-Grenadines 2.75 St.Vincent-Grenadines
61 Martinique 2.75 Martinique

62 rows × 3 columns

Show all the bean origin with highest rating and company location. There are 62 origins with the highest rating (4.00) from Mexico, Papua New Guinea, Haiti, Indonesia, Jamaica, etc.

In [29]:
# create bar chart to visualize highest rating by origin

bar1 = px.bar(high_rating_by_origin, x='bean_origin', y='rating', color='company_location', 
              color_discrete_sequence= ['#40E0D0', '#CD5C5C', '#1A5276','#6495ED', '#CCCCFF', '#FF5733', '#8E44AD', 
                                        '#D68910', '#0B5345', '#839192', '#F7DC6F', '#BFC9CA', px.colors.qualitative.Antique[0],
                                        px.colors.qualitative.Set3[4], px.colors.qualitative.Set3[9], px.colors.qualitative.T10[7],
                                       px.colors.qualitative.D3[0], px.colors.qualitative.Set3[6], px.colors.qualitative.T10[1],
                                       px.colors.qualitative.Set3[0]],
              title='Highest Rating by Bean Origin', height=700,
              labels={'bean_origin': 'Bean Origin', 'rating': 'Highest Rating', 'company_location':'Company Location'})

bar1.show()

Bar chart showed that the highest rating chocolate bar by bean origin mostly produced from U.S.A company. Other company locations which produced 4.00 rating are U.K, France, Venezuela, Vietnam, and Wales.

In [30]:
#color_continuous_scale = 'YlGnBu'
fig = px.colors.sequential.swatches_continuous()
#fig.show()
In [317]:
#color_discrete_sequence= [px.colors.qualitative.Pastel2[0],px.colors.qualitative.Pastel2[1]]
fig = px.colors.qualitative.swatches()
#fig.show()
In [31]:
# Question 2. What are the characteristics of the highest (4.00) rating chocolate bar?

# show detail values for rating = 4.00

highest_rating = df[df['rating']==4.00]

highest_rating_clean = highest_rating.drop(columns= ['id', 'company_location', 'year_reviewed', 'bar_name']).sort_values(by='cocoa_percent', ascending=False).reset_index(drop=True).rename_axis("Index", axis=0)
highest_rating_clean
Out[31]:
manufacturer bean_origin cocoa_percent num_ingredients ingredients review rating
Index
0 Soma Blend 88 3 B,S,C pronounced berry,mild smoke 4.0
1 Scharffen Berger Peru 78 4 B,S,C,L banana, pear, spice, cheese 4.0
2 Bonnat Venezuela 75 3 B,S,C creamy, fatty, floral 4.0
3 Pralus Sao Tome 75 4 B,S,C,L large nibs, spicy, cocoa 4.0
4 Bonnat Blend 75 3 B,S,C creamy, rich, complex 4.0
... ... ... ... ... ... ... ...
107 Valrhona Madagascar 64 5 B,S,C,V,L creamy, blueberry, raspberry 4.0
108 A. Morin Peru 63 3 B,S,C sweet, cocoa, tangerine 4.0
109 Valrhona Peru 63 5 B,S,C,V,L intense, dark berry, grapes 4.0
110 Madecasse (Cinagra) Madagascar 63 5 B,S,C,V,L intense, nutty, cocoa 4.0
111 AMMA Brazil 60 4 B,S,C,L creamy, sweet,cocoa,banana 4.0

112 rows × 7 columns

There are 112 chocolate bars with 4.00 rating

In [32]:
# Characteristics of 4.00 rating chocolate bar

df_characteristics = highest_rating['review'].str.split(',',expand=True).stack().value_counts().reset_index()
df_characteristics.columns = ['Characteristics', 'Count']
df_characteristics
Out[32]:
Characteristics Count
0 creamy 34
1 cocoa 21
2 nutty 15
3 spicy 6
4 complex 5
... ... ...
155 cherry 1
156 mild strawberry 1
157 molasses 1
158 bright fruit 1
159 cinamon 1

160 rows × 2 columns

There are 160 characteristics of chocolate bar which produced 4.00 rating

In [33]:
# bar chart visualization of characteristics of 4.00 rating chocolate bar

bar2 = px.bar(df_characteristics, x='Characteristics', y='Count', color = 'Count', color_continuous_scale = 'Sunset',
              title='Characteristics of Highest Rating Chocolate Bar', height=700,
              labels={'Characteristics': 'Characteristics', 'Count': 'Count'})

bar2.show()

The most characteristics of 4.00 rating chocolate bar are creamy, cocoa, and nutty with 34, 21, 15 amount.

In [34]:
# Question 3. What characteristics of chocolate bar with highest rating (4.00) based on ingredient?

# Show Ingredients data which produced 4.00 rating

highest_rating_ingredients = highest_rating_clean['ingredients'].value_counts().reset_index()
highest_rating_ingredients.columns = ['Ingredients', 'Count']
highest_rating_ingredients
Out[34]:
Ingredients Count
0 B,S,C 58
1 B,S 26
2 B,S,C,L 14
3 B,S,C,V,L 11
4 B,S,C,V 2
In [35]:
# Visualized Ingredients produced 4.00 rating

bar3 = px.bar(highest_rating_ingredients, x='Ingredients', y='Count', color = 'Count', color_continuous_scale = 'Sunset',
              title='Ingredients of Highest Rating Chocolate Bar', height=500, width = 600,
              labels={'Ingredients': 'Ingredients', 'Count': 'Count'})

bar3.show()

Bean, Sugar, and Cocoa Butter is the most combination ingredients used that produced 4.00 rating.

Legend : B = Bean S = Sugar C = Cocoa Butter L = Lecithin V = Vanilla

In [36]:
# Try to extract all of characteristics based on ingredients with 4.00 rating

# Extract characteristics for B,S ingredients with 4.00 rating
BS = highest_rating_clean[highest_rating_clean['ingredients']=='B,S']
BS_char = BS['review'].str.split(',',expand=True).stack().value_counts().reset_index()
BS_char.columns = ['Characteristics', 'BS_count']

# Extract characteristics for B,S,C ingredients with 4.00 rating
BSC = highest_rating_clean[highest_rating_clean['ingredients']=='B,S,C']
BSC_char = BSC['review'].str.split(',',expand=True).stack().value_counts().reset_index()
BSC_char.columns = ['Characteristics', 'BSC_count']

# Extract characteristics for B,S,C,L ingredients with 4.00 rating
BSCL = highest_rating_clean[highest_rating_clean['ingredients']=='B,S,C,L']
BSCL_char = BSCL['review'].str.split(',',expand=True).stack().value_counts().reset_index()
BSCL_char.columns = ['Characteristics', 'BSCL_count']

# Extract characteristics for B,S,C,V,L ingredients with 4.00 rating
BSCVL = highest_rating_clean[highest_rating_clean['ingredients']=='B,S,C,V,L']
BSCVL_char = BSCVL['review'].str.split(',',expand=True).stack().value_counts().reset_index()
BSCVL_char.columns = ['Characteristics', 'BSCVL_count']

# Extract characteristics for B,S,C,V ingredients with 4.00 rating
BSCV = highest_rating_clean[highest_rating_clean['ingredients']=='B,S,C,V']
BSCV_char = BSCV['review'].str.split(',',expand=True).stack().value_counts().reset_index()
BSCV_char.columns = ['Characteristics', 'BSCV_count']
In [37]:
# Merge ectracted data above become 1 dataframe

from functools import reduce
chars = [BS_char, BSC_char, BSCL_char, BSCVL_char, BSCV_char]

df_merged = reduce(lambda left,right: pd.merge(left, right, on=['Characteristics'], how='outer'), chars)
df_merged
Out[37]:
Characteristics BS_count BSC_count BSCL_count BSCVL_count BSCV_count
0 creamy 9.0 18.0 4.0 3.0 NaN
1 cocoa 4.0 11.0 3.0 3.0 NaN
2 sticky 3.0 NaN NaN NaN NaN
3 smooth 2.0 NaN NaN NaN NaN
4 dried fruit 2.0 2.0 NaN NaN NaN
... ... ... ... ... ... ...
154 green NaN NaN NaN 1.0 NaN
155 tart citrus NaN NaN NaN 1.0 NaN
156 long and rich NaN NaN NaN 1.0 NaN
157 strong spice NaN NaN NaN NaN 1.0
158 intense pepper NaN NaN NaN NaN 1.0

159 rows × 6 columns

In [38]:
# Stacked bar chart to visualize characteristics of all ingredients having 4.00 rating

fig5 = go.Figure(data=[
    go.Bar(name='B,S (Bean,Sugar)', x=df_merged.Characteristics, y=df_merged.BS_count),
    go.Bar(name='B,S,C (Bean,Sugar,Cocoa Butter)', x=df_merged.Characteristics, y=df_merged.BSC_count),
    go.Bar(name='B,S,C,L (Bean,Sugar,Cocoa Butter,Lecithin)', x=df_merged.Characteristics, y=df_merged.BSCL_count),
    go.Bar(name='B,S,C,V,L (Bean,Sugar,Cocoa Butter,Vanilla,Lecithin)', x=df_merged.Characteristics, y=df_merged.BSCVL_count),
    go.Bar(name='B,S,C,V (Bean,Sugar,Cocoa,Vanilla)', x=df_merged.Characteristics, y=df_merged.BSCV_count)
])

fig5.update_layout(barmode='stack', height=600, width=1000, plot_bgcolor='#FFFFFF',
                   title='Characteristics of Highest Rating Chocolate Bar by Ingredients',
                   xaxis_title= 'Characteristics',
                   yaxis_title= 'Count',
                   legend_title='Ingredients',
                   legend= dict(x=0.55, y=1.0, bgcolor='rgba(255, 255, 255, 0)',
                                bordercolor='rgba(255, 255, 255, 0)'),
                  )
fig5.show()

Stacked bar chart showed the composition each characterisctics based on ingredients

In [39]:
# characteristics in chocolate bar of B,S ingredients with 4.00 rating

BSR = highest_rating_clean[highest_rating_clean['ingredients']=='B,S']
BSR_char = BSR['review'].str.split(',',expand=True).stack().value_counts().reset_index()
BSR_char.columns = ['Characteristics', 'BS_count']
#BSR_char

BSR2 = BSR_char.to_string(index=False).strip()
BSR2
Out[39]:
'Characteristics  BS_count\n                   creamy         9\n                    cocoa         4\n                   sticky         3\n                   smooth         2\n              dried fruit         2\n                   coffee         2\n                    nutty         2\n                   cherry         2\n                  complex         2\n                     tart         2\n                     tart         1\n                    woody         1\n                    choco         1\n                   sticky         1\n            peanut butter         1\n              sweet spice         1\n                   floral         1\n                   banana         1\n                   yogurt         1\n                    nutty         1\n                    fruit         1\n                     mint         1\n                   cherry         1\n                    olive         1\n                 balanced         1\n                    smoke         1\n                red berry         1\n                     nuts         1\n                  tobacco         1\n               dark berry         1\n                 balanced         1\n               astringent         1\n                   banana         1\n                   roasty         1\n                     oily         1\n                  leather         1\n             well defined         1\n                   creamy         1\n              dried fruit         1\n                      fig         1\n              marshmallow         1\n             bright fruit         1\n            distinquished         1\n                 cardamon         1\ndistinct choco and graham         1\n     chocolate and grapes         1\n                  rounded         1\n                   orange         1\n                 tropical         1\n             long lasting         1\n perfectly balanced roast         1'
In [40]:
# word cloud for BS chocolate bar ingredient with 4.00 rating

word_cloud = WordCloud(width = 600, height = 300, background_color = '#FBEEE6', max_words=51, relative_scaling = 0, min_font_size = 10)
word_cloud.generate(BSR2)
plt.imshow(word_cloud, interpolation = 'bilinear')
plt.axis('off')
plt.show()
In [41]:
# Question 4. What is the detailed information about outstanding, highly recommended, recommended, disappointing, and unpleasant flavor chocolate bar?

# create new column to describe the rating, and drop some unused column

df_rating = df.copy()
#df_rating['flavor']=df_rating['rating']
        
df_rating.loc[df_rating['rating'] >= 1 , 'flavor'] = 'unpleasant'
df_rating.loc[df_rating['rating'] >= 2 , 'flavor'] = 'disappointing'
df_rating.loc[df_rating['rating'] >= 3 , 'flavor'] = 'recommended'
df_rating.loc[df_rating['rating'] >= 3.5 , 'flavor'] = 'highly recommended'
df_rating.loc[df_rating['rating'] >= 4 , 'flavor'] = 'outstanding'

df_rating = df_rating.drop(columns= ['id']).sort_values(by='rating', ascending=False)

df_rating
Out[41]:
manufacturer company_location year_reviewed bean_origin bar_name cocoa_percent num_ingredients ingredients review rating flavor
1829 Patric U.S.A. 2013 Peru Piura, Choc. Garage Exclusive 67 3 B,S,C creamy, cocoa, grapes 4.0 outstanding
575 Chokola U.S.A. 2020 Madagascar Bejofo, 2019 H., Batch 20 67 2 B,S cherry, perfectly balanced roast 4.0 outstanding
150 Arete U.S.A. 2018 Colombia Tumaco 70 2 B,S smooth, nutty, cocoa 4.0 outstanding
2389 Valrhona France 2007 Madagascar Manjari 64 5 B,S,C,V,L creamy, blueberry, raspberry 4.0 outstanding
2047 Scharffen Berger U.S.A. 2009 Ghana Asante 65 5 B,S,C,V,L simple, delicate cocoa, long 4.0 outstanding
... ... ... ... ... ... ... ... ... ... ... ...
1587 Middlebury U.S.A. 2012 Blend Houseblend 65 4 B,S*,C,Sa chemical, salt, wtf 1.5 unpleasant
632 Cote d' Or (Kraft) Belgium 2006 Blend Sensations Intense 70 <NA> NaN this is not chocolate 1.0 unpleasant
430 Callebaut Belgium 2007 Ecuador Baking 70 <NA> NaN bitter, cocoa 1.0 unpleasant
590 Claudio Corallo Sao Tome 2008 Blend Principe, Sao Tome & Principe 100 <NA> NaN chalky, musty, very bitter 1.0 unpleasant
1679 Neuhaus (Callebaut) Belgium 2007 Blend Dark 73 5 B,S,C,V,L pastey, strong off flavor 1.0 unpleasant

2530 rows × 11 columns

In [45]:
df_rating['flavor'].value_counts()
Out[45]:
flavor
recommended           987
highly recommended    865
disappointing         549
outstanding           112
unpleasant             17
Name: count, dtype: int64

Tabel showed the detail information (include manufacturer, bean_origin, bar_name, cocoa_percent, ingredient, review) about the flavor based on rating of chocolate bar.

In [46]:
# Question 5. What is the detailed information about unpleasant flavor chocolate bar?

df_rating[df_rating['flavor']=='unpleasant']
Out[46]:
manufacturer company_location year_reviewed bean_origin bar_name cocoa_percent num_ingredients ingredients review rating flavor
1180 Hotel Chocolat U.K. 2013 St. Lucia Island Growers, 2012, 120hr c., batch 13080 100 <NA> NaN pastey, bitter, unfixable 1.75 unpleasant
158 Artisan du Chocolat U.K. 2009 Brazil Brazil Rio Doce 72 3 B,S,C cocoa, dominate off note 1.75 unpleasant
164 Artisan du Chocolat U.K. 2010 Venezuela Venezuela 100 <NA> NaN very nutty, very bitter 1.75 unpleasant
626 Coppeneur Germany 2012 Ecuador Ecuador Puristique 100 <NA> NaN high intensity bitter 1.50 unpleasant
2383 Valrhona France 2007 Blend Le Noir Extra Amer 85 5 B,S,C,V,L creamy, fatty, very bitter 1.50 unpleasant
1423 Machu Picchu Trading Co. Peru 2011 Peru Peru 70 <NA> NaN vanilla, strong chemical 1.50 unpleasant
2118 Snake & Butterfly U.S.A. 2011 Ghana Ghana 67 3 B,S,C perfume, strong chemical 1.50 unpleasant
426 Cacaoyere (Ecuatoriana) Ecuador 2008 Ecuador Pichincha 91 4 B,S,C,L klingy, hint of fruit, very bitter 1.50 unpleasant
763 Dolfin (Belcolade) Belgium 2006 Blend Noir 70 3 B,S,L acidic, astringent, unpleasant 1.50 unpleasant
323 Bouga Cacao (Tulicorp) Ecuador 2009 Ecuador El Oro, Hacienda de Oro 100 <NA> NaN cardboard, very bitter, floral 1.50 unpleasant
2017 S.A.I.D. Italy 2010 Blend 100 percent 100 <NA> NaN sticky, intense, very bitter 1.50 unpleasant
293 Bonnat France 2006 Blend One Hundred 100 <NA> NaN acidic, bitter, dry 1.50 unpleasant
1587 Middlebury U.S.A. 2012 Blend Houseblend 65 4 B,S*,C,Sa chemical, salt, wtf 1.50 unpleasant
632 Cote d' Or (Kraft) Belgium 2006 Blend Sensations Intense 70 <NA> NaN this is not chocolate 1.00 unpleasant
430 Callebaut Belgium 2007 Ecuador Baking 70 <NA> NaN bitter, cocoa 1.00 unpleasant
590 Claudio Corallo Sao Tome 2008 Blend Principe, Sao Tome & Principe 100 <NA> NaN chalky, musty, very bitter 1.00 unpleasant
1679 Neuhaus (Callebaut) Belgium 2007 Blend Dark 73 5 B,S,C,V,L pastey, strong off flavor 1.00 unpleasant

Unpleasant flavor is chocolate bar that has rating around 1.0 - 1.9. Based on the table, unpleasant flavor mostly produced from blend bean, eventhough not all of the blend bean produced low rating of chocolate bar. Based on the correlarion matrix, data that has positive correlation with rating is year_reviewed, bean_origin. While manufacturer, company_location, cocoa percent, num_ingredient, ingredient have negative correlation with rating. This correlation number is low, the highest number of correlation is between rating vs year_reviewed (for positive) and num_ingredients (for negative).

Rating vs year_reviewed has high correlation. Based on the table, unpleasant falvor chocolate bars has old year_reviewed, it can be concluded that old year_reviewed can produced low rating chocolate bar

In [47]:
# Question 6. What rating trends by year?

fig_ry = px.scatter(df, x= 'year_reviewed', y='rating', color='year_reviewed', trendline='ols')

fig_ry.update_layout(height=500, width=800,
                   title='Scatter Plot and Regression of Rating by Year',
                   xaxis_title= 'Year',
                   yaxis_title= 'Rating'
                  )

fig_ry.show()

Scatter plot and regression line showed that rating and year has positive regression although the number is too low. But, ratings rose as the year progressed, there is no chocolate bar has rating below 2.5 in 2021, that means there is no chocolate bar with unpleasant flavor in 2021.

In [50]:
rating_year = df[['year_reviewed','rating']]

rating_year_avg = rating_year.groupby('year_reviewed').mean().reset_index()

figline_ry = px.line (rating_year_avg, x='year_reviewed', y='rating', markers=True)

figline_ry.update_layout(height=500, width=800,
                   title='Trend of Chocolate Bar Rating by Year',
                   xaxis_title= 'Year',
                   yaxis_title= 'Avg Rating'
                  )

figline_ry.show()

Based on the line chart or this trend, the average rating of chocolate bar show fluctuation by year, but in 2020 and 2021 the rating rose higher than before. The highest rating is produced in 2017 chocolate bar

In [51]:
# Question 7. What is the correlation between percentage of cocoa with rating?

fig_pr = px.scatter(df, x= 'cocoa_percent', y='rating', color='cocoa_percent', trendline='ols')

fig_pr.update_layout(height=500, width=800,
                   title='Scatter Plot and Regression of Cocoa Percent and Rating',
                   xaxis_title= 'Cocoa Percent',
                   yaxis_title= 'Rating'
                  )

fig_pr.show()

Scatter plot showed that the rating of chocolate bar decreased by higher percentage of cocoa

In [52]:
# Question 8. What is the correlation between bean origin vs rating?

rating_origin = df[['bean_origin','rating']]

rating_origin_avg = rating_origin.groupby('bean_origin').mean().reset_index()

fig_or = px.scatter(rating_origin_avg, x= 'rating', y='bean_origin', color='rating')

fig_or.update_layout(height=500, width=800,
                   title='Scatter Plot and Regression of Bean Origin and Rating',
                   xaxis_title= 'Avg Rating',
                   yaxis_title= 'Bean Origin'
                  )

fig_or.show()

There are 5 bean origins produced rating below 2.8 and 1 bean origin produced rating higher than 3.6

Average bean origin have rating around 3 - 3.3 produced recommended flavor

In [53]:
# Question 9. What insights can we gain from Indonesia bean origin?

df_indo = df[df['bean_origin']=='Indonesia'].drop(columns='id').sort_values(by='rating', ascending=False)
df_indo
Out[53]:
manufacturer company_location year_reviewed bean_origin bar_name cocoa_percent num_ingredients ingredients review rating
235 Benoit Nihant Belgium 2011 Indonesia Bali, Sukrama Bros. Farm, Melaya, 62hr C 72 3 B,S,C smokey, raspberries, cocoa 4.00
50 Akesson's (Pralus) U.K. 2011 Indonesia Bali (west), Sukrama Family, Melaya area 75 4 B,S,C,L hay, nut, berry 3.75
2456 Willie's Cacao U.K. 2010 Indonesia Java 69 3 B,S,C smokey, fruity, sour 3.75
772 Domori Italy 2007 Indonesia Java, Javablond 70 2 B,S creamy, smokey, tobacco 3.75
243 Beschle (Felchlin) Switzerland 2010 Indonesia Java, Grand Cru 64 3 B,S,C ham-like, smokey, banana 3.50
922 Fossa Singapore 2021 Indonesia Burang, Popayato-Paquat microlot, Goranulo Pro... 72 2 B,S dried fruit (fig), black licorice 3.50
167 Artisan du Chocolat U.K. 2010 Indonesia Bali 72 3 B,S,C black pepper and banana 3.50
267 Bittersweet Origins U.S.A. 2010 Indonesia Bali, Singaraja 75 2 B,S dry, cardamom, medicinal 3.25
2159 Soma Canada 2013 Indonesia Java 70 3 B,S,C creamy, smokey, some fruit 3.25
917 Fossa Singapore 2018 Indonesia Pak Eddy Farm, Yogyakarta Region 70 2 B,S sandy, powdery, late sour 3.00
1835 Pierre Marcolini Belgium 2006 Indonesia Kendem Lembu, Java 72 5 B,S,C,V,L red fruit, smokey, sour 3.00
1899 Pralus France 2019 Indonesia Java 75 4 B,S,C,L fruit, woody, ashey, fatty 3.00
264 Bittersweet Origins U.S.A. 2010 Indonesia Bali, Singaraja 65 2 B,S pastey, off spicey, nutty 2.75
75 Amano U.S.A. 2008 Indonesia Bali, Jembrana 70 4 B,S,C,V sticky, nutty, high acidity 2.75
1637 Molucca U.S.A. 2015 Indonesia Indonesia 70 2 B,S intense, smokey, floral 2.75
2015 Ruket Italy 2021 Indonesia Sumba, lot 02IN 77 2 B,S mild smoke, tart, spice, harsh 2.75
160 Artisan du Chocolat U.K. 2009 Indonesia Java 72 3 B,S,C gritty, very smokey, sour 2.75
1131 Heinde & Verre Netherlands 2019 Indonesia Pulau, Bali 71 4 B,S,C,L rubber, fatty, sandy,grape 2.50
1589 Middlebury U.S.A. 2012 Indonesia Balinese, Java 70 4 B,S*,C,Sa leather, salt 2.50
2457 Willie's Cacao U.K. 2013 Indonesia Java, Indonesian Black 100 <NA> NaN fruit, strong smoke, bitter 2.25
In [54]:
df_indo['manufacturer'].nunique()
Out[54]:
16
In [55]:
df_indo['company_location'].nunique()
Out[55]:
9

Indonesia cocoa bean used by 16 manufacturer and 9 company location, with highest rating (4.00) chocolate bar produced from Benoit Nihant, Belgium in 2011 using 72% cocoa, and ingredient of bean, sugar, cocoa butter.

Lowest rating is 2.25, was produced from Willie's Cacao, UK in 2013 with 100% cocoa that has bitter characteristic.

In [56]:
# Question 10. What most ingredient effect rating of chocolate bar?

df_ingredients = df.copy()

IngredientsDics = {
    "Beans": "B",
    "Sugar": "S",
    "Sweetener": "S*",
    "Cocoa Butter": "C",
    "Vanilla": "V",
    "Lecithin": "L",
    "Salt": "Sa"
}

# Creating new columns for each ingredient and make it numeric

for key in IngredientsDics:
    df_ingredients[key] = np.where(df_ingredients['ingredients'].str.contains(IngredientsDics[key],case=True), 1, 0)

df_ingredients2 = df_ingredients.drop(columns=['cocoa_percent','num_ingredients','id','manufacturer','company_location','year_reviewed','bean_origin','bar_name','ingredients','review']).sort_values(by='rating', ascending=False)


df_ingredients2
Out[56]:
rating Beans Sugar Sweetener Cocoa Butter Vanilla Lecithin Salt
1829 4.0 1 1 1 1 0 0 0
575 4.0 1 1 1 0 0 0 0
150 4.0 1 1 1 0 0 0 0
2389 4.0 1 1 1 1 1 1 0
2047 4.0 1 1 1 1 1 1 0
... ... ... ... ... ... ... ... ...
1587 1.5 1 1 1 1 0 0 1
632 1.0 1 1 1 1 1 1 1
430 1.0 1 1 1 1 1 1 1
590 1.0 1 1 1 1 1 1 1
1679 1.0 1 1 1 1 1 1 0

2530 rows × 8 columns

In [57]:
# build correlation heatmap to know ingredient that highly correlate with rating

df_ingredients2_corr=df_ingredients2.dropna().corr(method='pearson', numeric_only=True)
df_ingredients2_corr

ing_corr = px.imshow(df_ingredients2_corr, color_continuous_scale = 'YlGnBu', text_auto='.2f', aspect= 'auto',
                    title='Correlation Between Ingredients and Rating')
ing_corr.show()

Ingredients that have high correlation with rating are vanilla, salt, and lecithin (negative correlation). It can be concluded that vanilla, salt, and lecithin can produced low rating of chocolate bar

In [ ]: